## Incumbency data creation from raw data handcoded from kmd.dk
# Start from an empty workspace so that objects left over from an earlier
# session cannot end up in the saved data set.
rm(list = ls(all.names = TRUE))
library(dplyr)

# All paths below are relative ("data/..."), so the script has to be run from
# the project root. setwd() called without an argument is an error and would
# stop the script here, so the call is kept as a placeholder: uncomment it and
# fill in the project directory if the script is run from somewhere else.
# setwd("path/to/project") # set working directory

##Because the manual coding differs, the data on boards (who is elected) is
##first read in and set up from a separate file.

## Read the hand-coded 2005 board list and candidate list; only the first 4
## and the first 13 columns, respectively, are kept.
board.2005 <- read.table("data/raw.2005.boards.csv", sep = ";", header = TRUE)[, 1:4]
raw.2005   <- read.table("data/raw.2005.cand.csv", sep = ";", header = TRUE)[, 1:13]

## Keep only the text before the first comma. For candidate names this removes
## any information about location; for the board list it identifies parties by
## their list letter only. A value that splits into no pieces (e.g. an empty
## string) becomes NA.
first.piece <- function(x) {
  vapply(strsplit(as.character(x), ","), function(pieces) pieces[1],
         character(1), USE.NAMES = FALSE)
}

board.2005$candname <- first.piece(board.2005$candname)
board.2005$candpart <- first.piece(board.2005$candpart)
raw.2005$candname   <- first.piece(raw.2005$candname)

## Drop the candidate file's own candmano column; the candmano used further
## down is the one merged in from the board list.
raw.2005$candmano <- NULL

## Two individuals in Ringkøbing-Skjern have the same name and ran for the same
## party in 2005. Since neither of them is in the 2001 or 2009 data, both are
## deleted. The same applies to two individuals in Faxe in 2005.
## Every row whose (municipality, name, party) key occurs more than once is
## removed, i.e. all copies are dropped, not only the repeated ones.
key.is.unique <- function(d, cols = c("muncpr", "candname", "candpart")) {
  key <- d[, cols]
  !(duplicated(key) | duplicated(key, fromLast = TRUE))
}

raw.2005   <- raw.2005[key.is.unique(raw.2005), , drop = FALSE]
board.2005 <- board.2005[key.is.unique(board.2005), , drop = FALSE]

## Attach the board information to the candidates. The full outer merge also
## produces rows for board entries without a matching candidate; rows without
## a vote count are removed again afterwards.
raw.2005 <- merge(raw.2005, board.2005,
                  by = c("muncpr", "candname", "candpart"), all = TRUE)
raw.2005 <- raw.2005[!is.na(raw.2005$candvote), ]

## Election year
raw.2005$year <- 2005

## Elected indicator: 1 when a positive candmano was merged in, 0 otherwise
## (a missing candmano counts as not elected).
raw.2005$elected <- as.numeric(raw.2005$candmano > 0)
raw.2005$elected[is.na(raw.2005$elected)] <- 0

rm(first.piece, key.is.unique)

##################################
##################################
##repeat for 2009 and 2013######## 
##################################
##################################

## Read the hand-coded 2009 board list and candidate list; only the first 4
## and the first 13 columns, respectively, are kept.
board.2009 <- read.table("data/raw.2009.boards.csv", sep = ";", header = TRUE)[, 1:4]
raw.2009   <- read.table("data/raw.2009.cand.csv", sep = ";", header = TRUE)[, 1:13]

## Keep only the text before the first comma. For candidate names this removes
## any information about location; for the board list it identifies parties by
## their list letter only. A value that splits into no pieces (e.g. an empty
## string) becomes NA.
first.piece <- function(x) {
  vapply(strsplit(as.character(x), ","), function(pieces) pieces[1],
         character(1), USE.NAMES = FALSE)
}

board.2009$candname <- first.piece(board.2009$candname)
board.2009$candpart <- first.piece(board.2009$candpart)
raw.2009$candname   <- first.piece(raw.2009$candname)

## Drop the candidate file's own candmano column; the candmano used further
## down is the one merged in from the board list.
raw.2009$candmano <- NULL

## Remove any duplicates: every row whose (municipality, name, party) key
## occurs more than once is dropped, so no copy of an ambiguous key survives.
key.is.unique <- function(d, cols = c("muncpr", "candname", "candpart")) {
  key <- d[, cols]
  !(duplicated(key) | duplicated(key, fromLast = TRUE))
}

raw.2009   <- raw.2009[key.is.unique(raw.2009), , drop = FALSE]
board.2009 <- board.2009[key.is.unique(board.2009), , drop = FALSE]

## Attach the board information to the candidates. The full outer merge also
## produces rows for board entries without a matching candidate; rows without
## a vote count are removed again afterwards.
raw.2009 <- merge(raw.2009, board.2009,
                  by = c("muncpr", "candname", "candpart"), all = TRUE)
raw.2009 <- raw.2009[!is.na(raw.2009$candvote), ]

## Election year
raw.2009$year <- 2009

## Elected indicator: 1 when a positive candmano was merged in, 0 otherwise
## (a missing candmano counts as not elected).
raw.2009$elected <- as.numeric(raw.2009$candmano > 0)
raw.2009$elected[is.na(raw.2009$elected)] <- 0

rm(first.piece, key.is.unique)

#### 2013 ####

## Read the hand-coded 2013 board list and candidate list; only the first 4
## and the first 13 columns, respectively, are kept.
board.2013 <- read.table("data/raw.2013.boards.csv", sep = ";", header = TRUE)[, 1:4]
raw.2013   <- read.table("data/raw.2013.cand.csv", sep = ";", header = TRUE)[, 1:13]

## Removes a candmano2013 column if the candidate file has one (no effect
## otherwise).
raw.2013$candmano2013 <- NULL

## Keep only the text before the first comma. For candidate names this removes
## any information about location; for the board list it identifies parties by
## their list letter only. A value that splits into no pieces (e.g. an empty
## string) becomes NA.
first.piece <- function(x) {
  vapply(strsplit(as.character(x), ","), function(pieces) pieces[1],
         character(1), USE.NAMES = FALSE)
}

board.2013$candname <- first.piece(board.2013$candname)
board.2013$candpart <- first.piece(board.2013$candpart)
raw.2013$candname   <- first.piece(raw.2013$candname)

## Drop the candidate file's own candmano column; the candmano used further
## down is the one joined in from the board list.
raw.2013$candmano <- NULL

## Remove any duplicates: every row whose (municipality, name, party) key
## occurs more than once is dropped, so no copy of an ambiguous key survives.
id.cols <- c("muncpr", "candname", "candpart")
key.is.unique <- function(d, cols) {
  key <- d[, cols]
  !(duplicated(key) | duplicated(key, fromLast = TRUE))
}

raw.2013   <- raw.2013[key.is.unique(raw.2013, id.cols), , drop = FALSE]
board.2013 <- board.2013[key.is.unique(board.2013, id.cols), , drop = FALSE]

## Attach the board information to the candidates. The join keys are spelled
## out (the same ones used for 2005 and 2009) instead of relying on whichever
## columns happen to share a name.
raw.2013 <- left_join(raw.2013, board.2013, by = id.cols)
raw.2013 <- raw.2013[!is.na(raw.2013$candvote), ]

## Election year
raw.2013$year <- 2013

## Elected indicator, computed once and stored as numeric 0/1 like in the other
## years: 1 when candmano converts to a positive number, 0 otherwise (a missing
## candmano counts as not elected).
raw.2013$elected <- as.numeric(as.numeric(raw.2013$candmano) > 0)
raw.2013$elected[is.na(raw.2013$elected)] <- 0

rm(first.piece, key.is.unique, id.cols)

## The board lists are no longer needed once they have been merged in.
rm( board.2005, board.2009, board.2013)


##Create thresholds

## Compute, for every party list within a municipality, the vote margin that
## separates elected from non-elected candidates.
##
## Arguments (parallel vectors, one element per candidate):
##   mun       municipality identifier
##   party     party / list identifier
##   pervotes  votes received by the candidate
##   elected   1 for elected candidates, 0 otherwise
##
## Returns a data frame with one row per municipality-party combination that
## has at least one elected candidate:
##   muncpr, candpart2  municipality and party identifiers
##   marg.win           fewest votes among the elected candidates
##   marg.los           most votes among the non-elected (NA if there are none)
##   marg.thres         midpoint between marg.los and marg.win
pthres <- function (mun,party,pervotes,elected) {
  groups <- list(party, mun, elected)

  # aggregate() names the grouping columns Group.1 (party), Group.2 (mun) and
  # Group.3 (elected); the aggregated votes are returned in column x.
  weakest.winner  <- aggregate(pervotes, groups, FUN = min)
  strongest.loser <- aggregate(pervotes, groups, FUN = max)
  weakest.winner  <- weakest.winner[weakest.winner$Group.3 == 1, ]
  strongest.loser <- strongest.loser[strongest.loser$Group.3 == 0, ]

  # After the merge, x.x holds the winners' minimum and x.y the losers' maximum.
  margins <- merge(weakest.winner, strongest.loser,
                   by = c("Group.2", "Group.1"), all = TRUE)

  # Lists without any elected candidate have no winner margin; drop them.
  margins <- margins[!is.na(margins$x.x), ]
  margins$threshold <- margins$x.y + (margins$x.x - margins$x.y) / 2

  margins <- margins[, c("Group.2", "Group.1", "x.x", "x.y", "threshold")]
  names(margins) <- c("muncpr", "candpart2", "marg.win", "marg.los", "marg.thres")
  margins
}


## Attach each election's list-level margins (marg.win, marg.los, marg.thres)
## to its candidates. candpart2 is a copy of candpart that serves as the party
## key matching the output of pthres(). Candidates on lists for which pthres()
## returns no row keep NA margins.
out <- pthres(raw.2005$muncpr,raw.2005$candpart,raw.2005$candvote,raw.2005$elected)
raw.2005$candpart2 <- raw.2005$candpart
raw.2005 <- merge(raw.2005,out,by=c("muncpr","candpart2"),all.x=TRUE)

out <- pthres(raw.2009$muncpr,raw.2009$candpart,raw.2009$candvote,raw.2009$elected)
raw.2009$candpart2 <- raw.2009$candpart
raw.2009 <- merge(raw.2009,out,by=c("muncpr","candpart2"),all.x=TRUE)

out <- pthres(raw.2013$muncpr,raw.2013$candpart,raw.2013$candvote,raw.2013$elected)
raw.2013$candpart2 <- raw.2013$candpart
## left_join() already keeps every row of raw.2013. all.x is an argument of
## merge(), not of left_join(), and newer dplyr releases reject it as an
## unused argument, so it is not passed here.
raw.2013 <- left_join(raw.2013, out, by = c("muncpr", "candpart2"))

## Look up every candidate's result in the following election. Candidates are
## matched on municipality, party and name; the later election's "elected"
## indicator is stored as electedt1 and is NA when no match is found.
## The join keys are given explicitly so the match does not depend on which
## columns happen to share a name, and the column is renamed by name rather
## than by position.
next.keys <- c("muncpr", "candpart2", "candname")

raw.2009.join <- setNames(raw.2009[c(next.keys, "elected")],
                          c(next.keys, "electedt1"))
raw.2005 <- left_join(raw.2005, raw.2009.join, by = next.keys)

raw.2013.join <- setNames(raw.2013[c(next.keys, "elected")],
                          c(next.keys, "electedt1"))
raw.2009 <- left_join(raw.2009, raw.2013.join, by = next.keys)

rm(next.keys)

## Stack the 2005 and 2009 candidates (the two elections that had a following
## election joined on as electedt1).
data <- data.frame(rbind(raw.2005, raw.2009))

## Make sure candidate votes are numeric and drop rows without a vote count.
data$candvote <- as.numeric(data$candvote)
data <- data[!is.na(data$candvote), ]

## Forcing variable: distance of the candidate's votes from the list threshold,
## relative to that threshold.
data$z1 <- (data$candvote - data$marg.thres) / data$marg.thres

## rerun = 1 when the candidate was found in the following election. It must be
## computed before the missing electedt1 values are set to 0 (not elected).
data$rerun <- ifelse(is.na(data$electedt1), 0, 1)
data$electedt1[is.na(data$electedt1)] <- 0

## Keep open lists that are not split lists. A missing splitlist is treated as
## 0. A missing openlist is treated as "not an open list" and the row is
## dropped; a bare `openlist == 1` filter would instead turn such rows into
## all-NA rows.
data$splitlist[is.na(data$splitlist)] <- 0
data <- data[!is.na(data$openlist) & data$openlist == 1, ]
data <- data[data$splitlist != 1, ]

save(data, file = "data/data.rdata")
